Automate sentiment analysis of textual comments and feedback¶

In [1]:
import re
import string
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objects as go

from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
In [2]:
data = pd.read_csv(r"d:\Users\Mayank\Desktop\RIO-125\RIO-125-1\preprocessed_kindle_review .csv")
In [3]:
df=data
df
Out[3]:
Unnamed: 0 rating reviewText summary
0 0 5 This book was the very first bookmobile book I... 50 + years ago...
1 1 1 When I read the description for this book, I c... Boring! Boring! Boring!
2 2 5 I just had to edit this review. This book is a... Wiggleliscious/new toy ready/!!
3 3 5 I don't normally buy 'mystery' novels because ... Very good read.
4 4 5 This isn't the kind of book I normally read, a... Great Story!
... ... ... ... ...
11995 11995 2 Had to read certain passages twice--typos. Wi... Where's the meat?
11996 11996 3 Not what i expected. yet a very interesting bo... Interesting
11997 11997 5 Dragon Knights is a world where Knights ride d... Dragon Knights, Wings of Change (I Dream of Dr...
11998 11998 4 Since this story is very short, it's hard to s... Good writing, short story
11999 11999 4 from 1922 an amazing collection of info on sym... interesting public domain book

12000 rows × 4 columns

In [4]:
# printing dim of the data

data.shape
Out[4]:
(12000, 4)
In [5]:
# displaying all the columns of the dataset

data.columns
Out[5]:
Index(['Unnamed: 0', 'rating', 'reviewText', 'summary'], dtype='object')
In [6]:
# quick review of the dataset

data.head()
Out[6]:
Unnamed: 0 rating reviewText summary
0 0 5 This book was the very first bookmobile book I... 50 + years ago...
1 1 1 When I read the description for this book, I c... Boring! Boring! Boring!
2 2 5 I just had to edit this review. This book is a... Wiggleliscious/new toy ready/!!
3 3 5 I don't normally buy 'mystery' novels because ... Very good read.
4 4 5 This isn't the kind of book I normally read, a... Great Story!
In [7]:
# printing the first review from the dataset

data.reviewText[0]
Out[7]:
'This book was the very first bookmobile book I bought when I was in the school book club. I loved the story then and I bet a dollar to a donut I will love it again. If my memory serves, I bought this book in 5th grade. That would have been about 1961. I am looking forward to reliving the memories.'
In [8]:
# value_counts() function returns object containing counts of unique values. 
# The resulting object will be in descending order so that the first element is the most frequently-occurring element.

a=data.rating.value_counts()
a
Out[8]:
rating
5    3000
4    3000
1    2000
3    2000
2    2000
Name: count, dtype: int64
In [9]:
# checking for null values

data.isnull().sum()
Out[9]:
Unnamed: 0    0
rating        0
reviewText    0
summary       2
dtype: int64
In [10]:
# PLotting Rating histogram

data.rating.hist()
plt.title("Distribution of rating using Matplotlib")
plt.show()
No description has been provided for this image
In [11]:
# PLotting Rating using ploty

fig = go.Figure([go.Bar(x=a.index, y=a.values,text=a.values)])
fig.update_layout(title='Distribution of the Rating using ploty')
fig.show()

Preprocessing Data¶

In [12]:
#Dropping columns that are not needed
data.columns
Out[12]:
Index(['Unnamed: 0', 'rating', 'reviewText', 'summary'], dtype='object')
In [13]:
df = data.drop(['Unnamed: 0', 'summary'], axis=1)
df
Out[13]:
rating reviewText
0 5 This book was the very first bookmobile book I...
1 1 When I read the description for this book, I c...
2 5 I just had to edit this review. This book is a...
3 5 I don't normally buy 'mystery' novels because ...
4 5 This isn't the kind of book I normally read, a...
... ... ...
11995 2 Had to read certain passages twice--typos. Wi...
11996 3 Not what i expected. yet a very interesting bo...
11997 5 Dragon Knights is a world where Knights ride d...
11998 4 Since this story is very short, it's hard to s...
11999 4 from 1922 an amazing collection of info on sym...

12000 rows × 2 columns

In [14]:
# preview of the dataset

data.head()
Out[14]:
Unnamed: 0 rating reviewText summary
0 0 5 This book was the very first bookmobile book I... 50 + years ago...
1 1 1 When I read the description for this book, I c... Boring! Boring! Boring!
2 2 5 I just had to edit this review. This book is a... Wiggleliscious/new toy ready/!!
3 3 5 I don't normally buy 'mystery' novels because ... Very good read.
4 4 5 This isn't the kind of book I normally read, a... Great Story!
In [15]:
#converting rating to 0 and 1 from 1-5
# if rating is above 3 we will consider it as 1 else 0.

data["rating"] = data["rating"].apply(lambda x: 1 if x < 3  else 0) # positive as 0 and negative as 1
data
Out[15]:
Unnamed: 0 rating reviewText summary
0 0 0 This book was the very first bookmobile book I... 50 + years ago...
1 1 1 When I read the description for this book, I c... Boring! Boring! Boring!
2 2 0 I just had to edit this review. This book is a... Wiggleliscious/new toy ready/!!
3 3 0 I don't normally buy 'mystery' novels because ... Very good read.
4 4 0 This isn't the kind of book I normally read, a... Great Story!
... ... ... ... ...
11995 11995 1 Had to read certain passages twice--typos. Wi... Where's the meat?
11996 11996 0 Not what i expected. yet a very interesting bo... Interesting
11997 11997 0 Dragon Knights is a world where Knights ride d... Dragon Knights, Wings of Change (I Dream of Dr...
11998 11998 0 Since this story is very short, it's hard to s... Good writing, short story
11999 11999 0 from 1922 an amazing collection of info on sym... interesting public domain book

12000 rows × 4 columns

In [16]:
# lowering the text of the review

data["reviewText"] = data["reviewText"].str.lower()
data.head()
Out[16]:
Unnamed: 0 rating reviewText summary
0 0 0 this book was the very first bookmobile book i... 50 + years ago...
1 1 1 when i read the description for this book, i c... Boring! Boring! Boring!
2 2 0 i just had to edit this review. this book is a... Wiggleliscious/new toy ready/!!
3 3 0 i don't normally buy 'mystery' novels because ... Very good read.
4 4 0 this isn't the kind of book i normally read, a... Great Story!
In [17]:
# removing punctuation

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

data["reviewText"] = data["reviewText"].apply(lambda text: remove_punctuation(text))
data.head()
Out[17]:
Unnamed: 0 rating reviewText summary
0 0 0 this book was the very first bookmobile book i... 50 + years ago...
1 1 1 when i read the description for this book i co... Boring! Boring! Boring!
2 2 0 i just had to edit this review this book is an... Wiggleliscious/new toy ready/!!
3 3 0 i dont normally buy mystery novels because i j... Very good read.
4 4 0 this isnt the kind of book i normally read alt... Great Story!
In [18]:
#removing stop words from the dataset

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

data["reviewText"] = data["reviewText"].apply(lambda text: remove_stopwords(text))
data.head()
Out[18]:
Unnamed: 0 rating reviewText summary
0 0 0 book first bookmobile book bought school book ... 50 + years ago...
1 1 1 read description book couldnt wait read downlo... Boring! Boring! Boring!
2 2 0 edit review book believe got right updated rew... Wiggleliscious/new toy ready/!!
3 3 0 dont normally buy mystery novels dont like how... Very good read.
4 4 0 isnt kind book normally read although try limi... Great Story!
In [19]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

data["reviewText"] = data["reviewText"].apply(lambda text: lemmatize_words(text))
data.head()
Out[19]:
Unnamed: 0 rating reviewText summary
0 0 0 book first bookmobile book buy school book clu... 50 + years ago...
1 1 1 read description book couldnt wait read downlo... Boring! Boring! Boring!
2 2 0 edit review book believe get right update rewr... Wiggleliscious/new toy ready/!!
3 3 0 dont normally buy mystery novels dont like how... Very good read.
4 4 0 isnt kind book normally read although try limi... Great Story!

Visualization on Dataset¶

In [20]:
# PLotting Rating histogram

data.rating.hist()
plt.title("Distribution of rating using Matplotlib")
plt.show()
No description has been provided for this image
In [21]:
# PLotting Rating using ploty
a  = data.rating.value_counts()
fig = go.Figure([go.Bar(x=a.index, y=a.values,text=a.values)])
fig.update_layout(title='Distribution of the Rating using ploty')
fig.show()
In [22]:
#Plotting word cloud
text = " ".join(cat.split()[0] for cat in data.reviewText)
word_cloud = WordCloud(collocations = False, background_color = 'white').generate(text)
In [23]:
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()
No description has been provided for this image
In [24]:
from collections import Counter
cnt = Counter()
for text in data["reviewText"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)
Out[24]:
[('book', 15397),
 ('story', 11027),
 ('read', 10023),
 ('like', 6207),
 ('one', 5949),
 ('character', 5677),
 ('get', 5453),
 ('love', 5059),
 ('good', 4763),
 ('would', 4093)]
In [25]:
cnt.get("book")
Out[25]:
15397
In [26]:
cn = cnt.most_common(10)
w = []
c = []
In [27]:
for i in cn:
    w.append(i[0])
    c.append(i[1])
In [28]:
#Pie chart for Most Frequent Words
fig = px.pie(data, values=c, names=w, color_discrete_sequence=px.colors.sequential.RdBu)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Most Frequent Words")
fig.show()
In [29]:
#Displaying Rarewords
n_rare_words = 11
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
RAREWORDS
Out[29]:
{'1922',
 'backgroung',
 'don8216t',
 'firedrake',
 'gryphon',
 'helos',
 'insite',
 'meaness',
 'relm',
 'symbols',
 'twicetypos'}

Splitting the data¶

In [30]:
# preview of data

data
Out[30]:
Unnamed: 0 rating reviewText summary
0 0 0 book first bookmobile book buy school book clu... 50 + years ago...
1 1 1 read description book couldnt wait read downlo... Boring! Boring! Boring!
2 2 0 edit review book believe get right update rewr... Wiggleliscious/new toy ready/!!
3 3 0 dont normally buy mystery novels dont like how... Very good read.
4 4 0 isnt kind book normally read although try limi... Great Story!
... ... ... ... ...
11995 11995 1 read certain passage twicetypos wish build rel... Where's the meat?
11996 11996 0 expect yet interesting book usually don8216t r... Interesting
11997 11997 0 dragon knight world knight ride dragon slay wi... Dragon Knights, Wings of Change (I Dream of Dr...
11998 11998 0 since story short hard say much without give a... Good writing, short story
11999 11999 0 1922 amazing collection info symbols culture a... interesting public domain book

12000 rows × 4 columns

In [31]:
train, test = train_test_split(data, test_size = 0.3, stratify = data['rating'], random_state = 42)
In [32]:
cv= CountVectorizer(binary=True, min_df = 10, max_df = 0.95)
cv.fit_transform(train['reviewText'].values)
train_feature_set=cv.transform(train['reviewText'].values)
test_feature_set=cv.transform(test['reviewText'].values)
In [33]:
train_feature_set
Out[33]:
<8400x3773 sparse matrix of type '<class 'numpy.int64'>'
	with 322980 stored elements in Compressed Sparse Row format>
In [34]:
train_feature_set.shape[1]
Out[34]:
3773
In [35]:
cv.vocabulary_['book']
Out[35]:
422
In [36]:
y_train = train['rating'].values
y_test = test['rating'].values

Building our Model¶

In [37]:
lr = LogisticRegression(random_state = 42, max_iter=1000)
lr.fit(train_feature_set,y_train)
y_pred = lr.predict(test_feature_set)
print("Accuracy: ",round(metrics.accuracy_score(y_test,y_pred),3))
print("F1: ",round(metrics.f1_score(y_test, y_pred),3))
Accuracy:  0.823
F1:  0.725
In [38]:
cm1 = confusion_matrix(y_test, y_pred)
cm1
Out[38]:
array([[2123,  277],
       [ 360,  840]], dtype=int64)
In [39]:
cm2 = confusion_matrix(y_test, y_pred,normalize='true')
cm2
Out[39]:
array([[0.88458333, 0.11541667],
       [0.3       , 0.7       ]])
In [40]:
disp = ConfusionMatrixDisplay(confusion_matrix=cm1,display_labels=lr.classes_)
disp.plot()
plt.show()
No description has been provided for this image
In [41]:
disp = ConfusionMatrixDisplay(confusion_matrix=cm2,display_labels=lr.classes_)
disp.plot()
plt.show()
No description has been provided for this image
In [42]:
feature_importance = lr.coef_[0][:10]
for i,v in enumerate(feature_importance):
    print('Feature: ', list(cv.vocabulary_.keys())[list(cv.vocabulary_.values()).index(i)], 'Score: ', v)
Feature:  099 Score:  0.18776820123670163
Feature:  10 Score:  0.31980915609027416
Feature:  100 Score:  0.4884943889179449
Feature:  11 Score:  -0.3913364970987692
Feature:  12 Score:  -0.46257777467167416
Feature:  13 Score:  0.8420683409084077
Feature:  14 Score:  -0.6040472434596486
Feature:  15 Score:  0.9190523185535769
Feature:  16 Score:  0.45578601105113226
Feature:  17 Score:  0.43887300859858464
In [43]:
feature_importance = lr.coef_[0]
sorted_idx = np.argsort(feature_importance)
In [44]:
#Top words for the positive class (negative sentiment):

top_10_pos_w = [list(cv.vocabulary_.keys())[list(cv.vocabulary_.values()).index(w)] for w in sorted_idx[range(-1,-11, -1)]]
print(top_10_pos_w)
['waste', 'delete', 'cardboard', 'sorry', 'depress', 'boring', 'thin', '25', 'ugh', 'weird']
In [45]:
fig = plt.figure(figsize=(10, 6))
ax = sns.barplot(x=top_10_pos_w, y=feature_importance[sorted_idx[range(-1,-11, -1)]])
plt.title("Most Important Words Used for Negative Sentiment",fontsize = 20)
x_locs,x_labels = plt.xticks()
plt.setp(x_labels, rotation = 40)
plt.ylabel('Feature Importance', fontsize = 12)
plt.xlabel('Word', fontsize = 12);
No description has been provided for this image
In [46]:
#Top words for the negative class (positive sentiment):
top_10_neg_w = [list(cv.vocabulary_.keys())[list(cv.vocabulary_.values()).index(w)] for w in sorted_idx[:10]]
print(top_10_neg_w)
['enjoyed', 'verne', 'enjoyable', 'shot', 'fun', 'hunter', 'thumb', 'thrill', 'loved', 'hot']
In [47]:
fig = plt.figure(figsize=(10, 6))
ax = sns.barplot(x=top_10_neg_w, y=feature_importance[sorted_idx[:10]])
plt.title("Most Important Words Used for Positive Sentiment",fontsize = 20)
x_locs,x_labels = plt.xticks()
plt.setp(x_labels, rotation = 40)
plt.ylabel('Feature Importance', fontsize = 12)
plt.xlabel('Word', fontsize = 12);
No description has been provided for this image

Prediction on our sentences¶

In [48]:
lr.classes_#negative class first, positive class next
Out[48]:
array([0, 1], dtype=int64)
In [49]:
test_review = cv.transform(["I did not enjoy the book"])
p = lr.predict_proba(test_review)
s = lr.predict(test_review)
print("prob are:",p)
print("prediction are:",s)
prob are: [[0.78883762 0.21116238]]
prediction are: [0]
In [50]:
#hyperparameter
pred_proba_df = pd.DataFrame(lr.predict_proba(test_feature_set))
threshold_list = [0.3,0.4,0.45,0.5]
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    Y_test_pred = pred_proba_df.applymap(lambda x: 1 if x>i else 0)
    test_f1 = round(metrics.f1_score(y_test, Y_test_pred.loc[:,1].values),3)
    print('F1: {}'.format(test_f1))
******** For i = 0.3 ******
F1: 0.737

******** For i = 0.4 ******
F1: 0.732

******** For i = 0.45 ******
F1: 0.732

******** For i = 0.5 ******
F1: 0.725
C:\Users\Mayank\AppData\Local\Temp\ipykernel_11300\3813260304.py:6: FutureWarning:

DataFrame.applymap has been deprecated. Use DataFrame.map instead.

C:\Users\Mayank\AppData\Local\Temp\ipykernel_11300\3813260304.py:6: FutureWarning:

DataFrame.applymap has been deprecated. Use DataFrame.map instead.

C:\Users\Mayank\AppData\Local\Temp\ipykernel_11300\3813260304.py:6: FutureWarning:

DataFrame.applymap has been deprecated. Use DataFrame.map instead.

C:\Users\Mayank\AppData\Local\Temp\ipykernel_11300\3813260304.py:6: FutureWarning:

DataFrame.applymap has been deprecated. Use DataFrame.map instead.

In [ ]: